# Load the hospital dataset from the project CSV (113 hospitals, 12 columns).
hospital_data <- read.csv("./project_hospitals.csv", header = TRUE)
# View the first few rows of the dataset
head(hospital_data)
# View column names (features)
colnames(hospital_data)
## [1] "ID" "Lgth.of.Sty" "Age" "Inf.Risk" "R.Cul.Rat" "R.CX.ray.Rat" "N.Beds"
## [8] "Med.Sc.Aff" "Region" "Avg.Pat" "Avg.Nur" "Pct.Ser.Fac"
# Summary statistics of all variables
summary(hospital_data)
## ID Lgth.of.Sty Age Inf.Risk R.Cul.Rat R.CX.ray.Rat N.Beds
## Min. : 1 Min. : 6.700 Min. :38.80 Min. :1.300 Min. : 1.60 Min. : 39.60 Min. : 29.0
## 1st Qu.: 29 1st Qu.: 8.340 1st Qu.:50.90 1st Qu.:3.700 1st Qu.: 8.40 1st Qu.: 69.50 1st Qu.:106.0
## Median : 57 Median : 9.420 Median :53.20 Median :4.400 Median :14.10 Median : 82.30 Median :186.0
## Mean : 57 Mean : 9.648 Mean :53.23 Mean :4.355 Mean :15.79 Mean : 81.63 Mean :252.2
## 3rd Qu.: 85 3rd Qu.:10.470 3rd Qu.:56.20 3rd Qu.:5.200 3rd Qu.:20.30 3rd Qu.: 94.10 3rd Qu.:312.0
## Max. :113 Max. :19.560 Max. :65.90 Max. :7.800 Max. :60.50 Max. :133.50 Max. :835.0
## Med.Sc.Aff Region Avg.Pat Avg.Nur Pct.Ser.Fac
## Min. :1.00 Min. :1.000 Min. : 20.0 Min. : 14.0 Min. : 5.70
## 1st Qu.:2.00 1st Qu.:2.000 1st Qu.: 68.0 1st Qu.: 66.0 1st Qu.:31.40
## Median :2.00 Median :2.000 Median :143.0 Median :132.0 Median :42.90
## Mean :1.85 Mean :2.363 Mean :191.4 Mean :173.2 Mean :43.16
## 3rd Qu.:2.00 3rd Qu.:3.000 3rd Qu.:252.0 3rd Qu.:218.0 3rd Qu.:54.30
## Max. :2.00 Max. :4.000 Max. :791.0 Max. :656.0 Max. :80.00
# Count missing values per column -- all zeros below, so no imputation is needed.
colSums(is.na(hospital_data))
## ID Lgth.of.Sty Age Inf.Risk R.Cul.Rat R.CX.ray.Rat N.Beds Med.Sc.Aff Region
## 0 0 0 0 0 0 0 0 0
## Avg.Pat Avg.Nur Pct.Ser.Fac
## 0 0 0
# Visualizing Missing Values with Better Formatting
# vis_miss() is presumably from the naniar package (loaded elsewhere in the
# file -- TODO confirm); it returns a ggplot object, so layers can be added.
vis_miss(hospital_data) +
labs(title = "Visualizing Missing Data",
x = "",
y = "") +
theme_minimal() +
theme(
plot.title = element_text(size = 8, face = "bold"),
plot.subtitle = element_text(size = 8),
axis.text.x = element_text(angle = 90, hjust = 1)
)
Our data set hospital_data, read from the csv file
project_hospitals.csv, contains 113 observations of 12
variables. We have no missing values for any variable, thus we do not
need to impute for any observations. Out of the 12 variables, one of
them is an identifier variable ID, and will not be used for
predicting. The patient’s length of stay Lgth.of.Sty is the
variable we would like to predict. Therefore, we have 10 potential
explanatory variables and 1 variable we are trying to predict.
# Histogram of the response variable: patient length of stay, in days.
# A constant dummy factor is mapped to fill purely so scale_fill_manual()
# can apply a single palette colour; the legend is then suppressed.
los_histogram <- ggplot(hospital_data, aes(x = Lgth.of.Sty, fill = factor(1)))
los_histogram +
  geom_histogram(binwidth = 2, color = "black") +
  scale_fill_manual(values = base_palette[3]) +
  guides(fill = "none") +
  labs(x = "Length of Stay (Days)", y = "Count", title = "Distribution of Length of Stay")
We see some slight right skew for Lgth.of.Sty.
# Drop the identifier column -- it carries no predictive information.
base_data <- hospital_data |> select(-ID)
#base_data$RegionRaw <- base_data$Region
# Encode the two categorical predictors as labelled factors.
# factor(levels =, labels =) converts the integer codes and assigns the
# readable level names in a single call; this replaces the previous
# as.factor() + plyr::revalue() two-step (plyr is superseded) while
# producing identical factors (same level order: NE/NC/S/W and Yes/No).
base_data$Region <- factor(base_data$Region, levels = 1:4, labels = c("NE", "NC", "S", "W"))
base_data$Med.Sc.Aff <- factor(base_data$Med.Sc.Aff, levels = 1:2, labels = c("Yes", "No"))
# Human-readable labels for graphs, ordered to match the columns of base_data
# (index i labels column i).
base_data_labels <- c(
  "Length of Stay",
  "Age",
  "Infection Risk",
  "Routine culturing ratio",
  "Routine chest X-ray ratio",
  "Number of beds",
  "Medical School Affiliation",
  "Region",
  "Average Daily census",
  "Number of nurses",
  "Available facilities"
)
# Summary after cleaning: factor columns now show level counts.
summary(base_data)
## Lgth.of.Sty Age Inf.Risk R.Cul.Rat R.CX.ray.Rat N.Beds Med.Sc.Aff Region
## Min. : 6.700 Min. :38.80 Min. :1.300 Min. : 1.60 Min. : 39.60 Min. : 29.0 Yes:17 NE:28
## 1st Qu.: 8.340 1st Qu.:50.90 1st Qu.:3.700 1st Qu.: 8.40 1st Qu.: 69.50 1st Qu.:106.0 No :96 NC:32
## Median : 9.420 Median :53.20 Median :4.400 Median :14.10 Median : 82.30 Median :186.0 S :37
## Mean : 9.648 Mean :53.23 Mean :4.355 Mean :15.79 Mean : 81.63 Mean :252.2 W :16
## 3rd Qu.:10.470 3rd Qu.:56.20 3rd Qu.:5.200 3rd Qu.:20.30 3rd Qu.: 94.10 3rd Qu.:312.0
## Max. :19.560 Max. :65.90 Max. :7.800 Max. :60.50 Max. :133.50 Max. :835.0
## Avg.Pat Avg.Nur Pct.Ser.Fac
## Min. : 20.0 Min. : 14.0 Min. : 5.70
## 1st Qu.: 68.0 1st Qu.: 66.0 1st Qu.:31.40
## Median :143.0 Median :132.0 Median :42.90
## Mean :191.4 Mean :173.2 Mean :43.16
## 3rd Qu.:252.0 3rd Qu.:218.0 3rd Qu.:54.30
## Max. :791.0 Max. :656.0 Max. :80.00
# Pearson correlation heatmap of the numeric columns (factors excluded --
# cor() requires numeric input).
cor_matrix = cor(base_data |> select(-Region, -Med.Sc.Aff))
ggcorrplot(cor_matrix, lab = TRUE)
Length of Stay correlates most strongly with Infection Risk and with Average Patients.
# Scatterplot matrix of the numeric predictors, with a fitted OLS line
# drawn in each panel.
numeric_cols <- base_data |> select(-Region, -Med.Sc.Aff)
scatter_panel <- function(xs, ys) {
  points(xs, ys, pch = 16, col = "blue")
  abline(lm(ys ~ xs), col = "red")
}
pairs(numeric_cols, panel = scatter_panel)
# ggpairs includes the factor columns as well and adds densities and
# correlation coefficients in the upper triangle.
ggpairs(base_data, lower = list(continuous = wrap("smooth", method = "lm", color = "blue")))
The variables Number of Beds, Average Patients, Average Nurses, and Percentage of service facilities exhibit high intercorrelation, particularly between Number of Beds and Average Patients (0.981), which could introduce collinearity in a predictive model.
A strong correlation that appears to be an important factor in hospital stays is Infection Risk (0.533).
Both Culture Rate and Chest X-Ray Rate display moderate correlation with Infection Risk; this suggests an intuitive relationship: as infection risk increases, more diagnostic tests are performed. Infection Risk may also serve as a useful predictor for Length of Stay, as higher infection risk could lead to prolonged hospital stays.
# Compare correlation structure under three functional forms:
#   linear-linear  : raw response vs raw predictors
#   linear-log     : raw response vs log predictors
#   log-linear     : log response vs raw predictors
cor_data <- base_data %>% select(-Med.Sc.Aff, -Region)

# Log every column, then restore the raw response column.
lin_log_cor_data <- log(cor_data)
lin_log_cor_data$Lgth.of.Sty <- cor_data$Lgth.of.Sty

# Keep raw predictors; append the logged response instead.
log_lincor_data <- cor_data %>% select(-Lgth.of.Sty)
log_lincor_data$logLgth.of.Sty <- log(cor_data$Lgth.of.Sty)

# Compute all three Pearson matrices up front, then render the heatmaps
# in the same order as before.
correlation_matrix <- cor(cor_data, method = "pearson")
lin_log_correlation_matrix <- cor(lin_log_cor_data, method = "pearson")
log_lin_correlation_matrix <- cor(log_lincor_data, method = "pearson")
ggcorrplot(correlation_matrix, lab = TRUE, title = "Linear-Linear")
ggcorrplot(lin_log_correlation_matrix, lab = TRUE, title = "Linear-Log")
ggcorrplot(log_lin_correlation_matrix, lab = TRUE, title = "Log-Linear")
# Long format for faceting: one row per (observation, predictor) pair.
# Columns 2:6 and 9:11 are the numeric predictors (skips the response in
# column 1 and the two factor columns 7:8).
long_data <- base_data %>%
  pivot_longer(cols = c(2:6, 9:11), names_to = "Variable", values_to = "Value")
# Loess fit of Length of Stay against each raw predictor.
ggplot(long_data, aes(x = Value, y = Lgth.of.Sty)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess") +
  facet_wrap(~ Variable, scales = "free_x") +
  theme_minimal()
# Same relationship with log-transformed predictors on the x axis.
# Reuses long_data: the original recomputed an identical pivot_longer here,
# which was redundant.
ggplot(long_data, aes(x = log(Value), y = Lgth.of.Sty)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "loess") +
  facet_wrap(~ Variable, scales = "free_x") +
  theme_minimal()
# Define color palette for the 8 faceted predictors.
# "Set1" has a maximum of 9 colours; the original request of n = 16 triggered
# a brewer.pal() warning and silently returned only 9. Eight is all we need.
plot_colors <- RColorBrewer::brewer.pal(n = 8, name = "Set1")
# Convert data to long format for faceting
long_data <- base_data %>%
  pivot_longer(cols = c(2:6, 9:11), names_to = "Variable", values_to = "Value") %>%
  filter(Value > 0, Lgth.of.Sty > 0) # log() requires strictly positive values
# Create the faceted log-log ggplot
ggplot(long_data, aes(x = log(Value), y = log(Lgth.of.Sty))) +
  geom_point(alpha = 0.6, aes(color = Variable)) +
  geom_smooth(method = "loess", se = FALSE, aes(color = Variable)) +
  scale_color_manual(values = plot_colors) +
  facet_wrap(~ Variable, scales = "free_x") +
  ggtitle("Log-Transformed Variables vs Log Length of Stay") +
  xlab("Log Predictor Variables") +
  ylab("Log Length of Stay (Days)") +
  theme_minimal() +
  theme(legend.position = "none")
# Boxplots of length of stay by each categorical predictor, coloured by
# the other one, stacked vertically with gridExtra::grid.arrange().
region_patient <- ggplot(data = base_data, aes(x = Region, y = Lgth.of.Sty, colour = Med.Sc.Aff)) +
  geom_boxplot() +
  ggtitle("Region vs Patient Length of Stay", "Grouped by Medical School Affiliation")
med_assoc <- ggplot(data = base_data, aes(x = Med.Sc.Aff, y = Lgth.of.Sty, colour = Region)) +
  geom_boxplot() +
  # Title typo fixed: "Pat," -> "Pat."
  ggtitle("Med. School Aff. vs Pat. Length of Stay", "Grouped by Region")
grid.arrange(grobs = list(med_assoc, region_patient), ncol = 1)
By looking at the distributions (via the boxplots), we can see that there are some differences between a patient’s length of stay if we group by Region (or by Medical School Affiliation).
If grouping by Medical school affiliation: Length of stay between the regions is fairly similar on average if there is an affiliation for medical school, but the distributions are much more different between the regions. If there is no affiliation, then the distributions are a bit more similar, but the averages are different between each region.
If grouping by Region: Length of stay on average tends to be similar between the regions (being slightly less in the W and S regions) with NE having a wider distribution of length of stay. Interesting to note that medical school affiliation tends to have lower length of stay if the affiliation is none.
# One scatter plot per numeric predictor vs Length of Stay, coloured by Region.
plot_list <- list()
indexes <- c(2:6, 9:11) # numeric predictor columns of base_data
for (i in seq_along(indexes)) {
  x_var <- names(base_data)[indexes[i]]
  # aes(.data[[x_var]]) replaces the deprecated aes_string() for programmatic
  # column mapping (ggplot2 >= 3.0).
  # Label fix: the original used base_data_labels[i], but i indexes the
  # `indexes` vector, not base_data's columns -- every plot was mistitled
  # (e.g. Age captioned "Length of Stay"). base_data_labels[indexes[i]]
  # selects the label matching the plotted column.
  temp_plot <- ggplot(data = base_data, aes(x = .data[[x_var]], y = Lgth.of.Sty, color = Region)) +
    geom_point() +
    geom_smooth() +
    ggtitle(paste0(base_data_labels[indexes[i]], " vs Length of Stay"), "Grouped by Region") +
    xlab(base_data_labels[indexes[i]]) +
    ylab("Length of Stay (Days)") +
    theme(legend.position = "right")
  plot_list[[length(plot_list) + 1]] <- temp_plot
}
grid.arrange(grobs = plot_list, ncol = 2)
# Same per-predictor scatter plots, now coloured by Medical School Affiliation.
plot_list <- list()
indexes <- c(2:6, 9:11) # numeric predictor columns of base_data
for (i in seq_along(indexes)) {
  x_var <- names(base_data)[indexes[i]]
  # aes(.data[[x_var]]) replaces the deprecated aes_string().
  # Label fix: base_data_labels[indexes[i]] (not [i]) so each plot's title
  # and axis label match the column actually plotted.
  temp_plot <- ggplot(data = base_data, aes(x = .data[[x_var]], y = Lgth.of.Sty, color = Med.Sc.Aff)) +
    geom_point() +
    geom_smooth() +
    ggtitle(paste0(base_data_labels[indexes[i]], " vs Length of Stay"), "Grouped by Medical School Affiliation") +
    xlab(base_data_labels[indexes[i]]) +
    ylab("Length of Stay (Days)") +
    theme(legend.position = "right")
  plot_list[[length(plot_list) + 1]] <- temp_plot
}
grid.arrange(grobs = plot_list, ncol = 2)
# Baseline OLS model: regress length of stay on every remaining predictor.
all_vars_model = lm(Lgth.of.Sty ~ ., data = base_data)
# Standard 2x2 grid of lm diagnostic plots
par(mfrow = c(2,2))
plot(all_vars_model)
summary(all_vars_model)
##
## Call:
## lm(formula = Lgth.of.Sty ~ ., data = base_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3048 -0.6608 -0.0272 0.5862 6.3001
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.322292 1.782122 1.864 0.065222 .
## Age 0.079922 0.028266 2.827 0.005668 **
## Inf.Risk 0.439665 0.127298 3.454 0.000812 ***
## R.Cul.Rat 0.005546 0.015982 0.347 0.729299
## R.CX.ray.Rat 0.012688 0.007147 1.775 0.078892 .
## N.Beds -0.004851 0.003603 -1.346 0.181224
## Med.Sc.AffNo -0.266644 0.441089 -0.605 0.546872
## RegionNC -0.812966 0.351406 -2.313 0.022744 *
## RegionS -1.158277 0.351704 -3.293 0.001370 **
## RegionW -1.880560 0.444136 -4.234 5.1e-05 ***
## Avg.Pat 0.015182 0.004424 3.432 0.000872 ***
## Avg.Nur -0.005891 0.002218 -2.656 0.009203 **
## Pct.Ser.Fac -0.012179 0.013774 -0.884 0.378698
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.231 on 100 degrees of freedom
## Multiple R-squared: 0.6299, Adjusted R-squared: 0.5855
## F-statistic: 14.18 on 12 and 100 DF, p-value: < 2.2e-16
The diagnostic plots suggest the linear model fits reasonably well.
The Residuals vs. Fitted and Scale-Location plots show residuals scattered around zero, with no strong signs of nonlinearity or heteroskedasticity. However, Observation 47 stands out, showing a large residual across multiple plots. In the Q–Q plot, most points follow the regression line, but Observation 47 appears on the right tail, deviating from normality.
Leverage diagnostics indicate that most data points are well-standardized around the fitted line. Observation 47 is near the threshold for Cook’s distance, suggesting potential influence, while Observation 112 exhibits high leverage without strong influence. These points may warrant further investigation to understand their impact on the model.
No major assumption violations are evident, but examining these observations could help assess model robustness.
# Generalized variance inflation factors (car::vif). For terms with Df > 1
# (Region), GVIF^(1/(2*Df)) is the value comparable across terms.
vif(all_vars_model)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.176172 1 1.084515
## Inf.Risk 2.154694 1 1.467888
## R.Cul.Rat 1.978520 1 1.406599
## R.CX.ray.Rat 1.416265 1 1.190070
## N.Beds 35.699204 1 5.974881
## Med.Sc.Aff 1.855334 1 1.362107
## Region 1.715222 3 1.094091
## Avg.Pat 34.211423 1 5.849053
## Avg.Nur 7.055523 1 2.656224
## Pct.Ser.Fac 3.241812 1 1.800503
Number of Beds and Average Patients have extremely high VIFs, suggesting high multicollinearity. This makes sense, as the more room a hospital has (number of beds) then the more patients they can have on average. Average number of full time nurses has potential collinearity as well, which also makes sense as a hospital with more beds will likely have more nursing staff to take care of the patients.
# Reduced model: drops N.Beds and Avg.Pat, the two predictors with extreme
# VIFs in the all-variables fit.
test_model <- Lgth.of.Sty ~ Age + Inf.Risk + R.Cul.Rat + R.CX.ray.Rat +
  Avg.Nur + Pct.Ser.Fac + Med.Sc.Aff + Region
test_model_fit <- lm(test_model, data = base_data)
# Diagnostic plots in a 2x2 grid, then restore the single-panel layout.
par(mfrow = c(2, 2))
plot(test_model_fit)
par(mfrow = c(1, 1))
In comparing the first model (all variables model) and this model, the residuals vs. fitted plot now shows points more evenly scattered around the reference line, indicating fewer systematic patterns in the residuals and a flatter loess curve.
The Q–Q plot remains largely linear, though observations such as #47 and #112 still appear in the right tail, suggesting they continue to exert some influence.
The scale–location plot demonstrates that the spread of residuals has become more consistent across fitted values, with a flatter red line that signals more homoscedasticity than before.
The residuals vs. leverage plot, #47 no longer rises as close to the typical Cook’s distance threshold, while #112 has shifted from being primarily a high‐leverage point to one with a moderately large residual—though neither appears as influential as in the initial model.
# Coefficients and fit statistics for the reduced model.
summary(test_model_fit)
##
## Call:
## lm(formula = test_model, data = base_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7517 -0.8456 -0.0709 0.5848 6.9770
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.025158 1.951126 2.063 0.04165 *
## Age 0.078458 0.031248 2.511 0.01362 *
## Inf.Risk 0.576846 0.136458 4.227 5.16e-05 ***
## R.Cul.Rat -0.013363 0.017119 -0.781 0.43682
## R.CX.ray.Rat 0.011462 0.007911 1.449 0.15045
## Avg.Nur 0.001031 0.001621 0.636 0.52605
## Pct.Ser.Fac -0.003332 0.014373 -0.232 0.81716
## Med.Sc.AffNo -0.970771 0.460747 -2.107 0.03757 *
## RegionNC -0.962513 0.374275 -2.572 0.01156 *
## RegionS -1.135900 0.376770 -3.015 0.00324 **
## RegionW -2.511570 0.459006 -5.472 3.20e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.363 on 102 degrees of freedom
## Multiple R-squared: 0.5368, Adjusted R-squared: 0.4914
## F-statistic: 11.82 on 10 and 102 DF, p-value: 2.727e-13
# VIFs after dropping N.Beds and Avg.Pat -- all well below concern levels.
vif(test_model_fit)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.171453 1 1.082337
## Inf.Risk 2.017887 1 1.420523
## R.Cul.Rat 1.850070 1 1.360173
## R.CX.ray.Rat 1.414378 1 1.189276
## Avg.Nur 3.071224 1 1.752491
## Pct.Ser.Fac 2.877054 1 1.696188
## Med.Sc.Aff 1.649862 1 1.284470
## Region 1.381390 3 1.055325
# Build a table comparing candidate outliers #47 and #112 against the
# per-variable mean and median of the full numeric data.
numeric_data <- base_data |> select(where(is.numeric))
# Per-variable mean and median, reshaped to one row per variable with
# columns "mean" and "median" (".value" pulls the stat name out of the
# summarise() column names via names_pattern).
stats <- numeric_data |>
dplyr::summarise(
dplyr::across(
.cols = everything(),
.fns = list(
mean = ~ mean(.x, na.rm = TRUE),
median = ~ median(.x, na.rm = TRUE)
)
)
) |>
pivot_longer(
cols = everything(),
names_to = c("variable", ".value"),
names_pattern = "(.*)_(mean|median)"
)
# Long table of the two flagged observations' values (rownames "47"/"112"
# become obs_id), joined to the stats and differenced.
outliers_diff <- numeric_data[c(47, 112), ] |>
as_tibble(rownames = "obs_id") %>%
pivot_longer(
cols = -obs_id,
names_to = "variable",
values_to = "value"
) |>
left_join(stats, by = "variable") |>
dplyr::mutate(
diff_from_mean = value - mean,
diff_from_median = value - median
)
# Widen back out so each observation gets its own value/distance columns,
# round everything, and give the columns display-friendly names.
final_table <- outliers_diff |>
pivot_wider(
id_cols = c(variable, mean, median),
names_from = obs_id,
values_from = c(value, diff_from_mean, diff_from_median)
) |>
dplyr::mutate(dplyr::across(where(is.numeric), ~ round(.x, 3))) |>
dplyr::rename(
"#47 Value" = value_47,
"#47 Mean Distance" = diff_from_mean_47,
"#47 Median Distance" = diff_from_median_47,
"#112 Value" = value_112,
"#112 Mean Distance" = diff_from_mean_112,
"#112 Median Distance" = diff_from_median_112,
) |>
dplyr::select(
variable, mean, median,
`#112 Value`, `#47 Value`, `#112 Mean Distance`, `#47 Mean Distance`, `#112 Median Distance`, `#47 Median Distance`,
)
# Render as a static DT table (dom = "t" hides search/pagination controls).
datatable(final_table, options = list(dom = "t", width = "100%", scrollX = TRUE))
# Refit the reduced model with the two influential observations
# (#47 and #112) excluded.
test_model_remove <- Lgth.of.Sty ~ Age + Inf.Risk + R.Cul.Rat + R.CX.ray.Rat +
  Avg.Nur + Pct.Ser.Fac + Med.Sc.Aff + Region
test_model_remove_fit <- lm(test_model_remove, data = base_data[c(-112, -47), ])
# Diagnostics on the refit, restoring the plotting layout afterwards.
par(mfrow = c(2, 2))
plot(test_model_remove_fit)
par(mfrow = c(1, 1))
After removing observations #47 and #112, the model’s residual diagnostics show notable improvements.
Residuals vs. Fitted plot indicates a reduction in heteroscedasticity, as the variance of residuals is now more evenly spread with less curvature in the trend.
The Q-Q plot aligns more closely with the normal distribution, suggesting an improvement in the normality assumption, as the extreme deviations from #47 and #112 are no longer distorting the distribution.
In the Scale-Location plot, variance appears more stable, reinforcing the improved homoscedasticity.
Finally, the Residuals vs. Leverage plot confirms that these observations had high leverage and strong influence on model coefficients, as their removal results in a more balanced distribution of leverage across data points.
# Coefficients and fit statistics after removing observations #47 and #112.
summary(test_model_remove_fit)
##
## Call:
## lm(formula = test_model_remove, data = base_data[c(-112, -47),
## ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.19321 -0.65320 -0.06784 0.51335 3.01070
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.228576 1.466415 3.566 0.000559 ***
## Age 0.054064 0.023485 2.302 0.023404 *
## Inf.Risk 0.469248 0.102685 4.570 1.40e-05 ***
## R.Cul.Rat -0.004346 0.012845 -0.338 0.735797
## R.CX.ray.Rat 0.008063 0.005934 1.359 0.177264
## Avg.Nur 0.001118 0.001213 0.922 0.358648
## Pct.Ser.Fac -0.002924 0.010752 -0.272 0.786261
## Med.Sc.AffNo -0.689680 0.349635 -1.973 0.051306 .
## RegionNC -0.544629 0.283414 -1.922 0.057494 .
## RegionS -0.758340 0.284514 -2.665 0.008968 **
## RegionW -2.069147 0.346224 -5.976 3.53e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.018 on 100 degrees of freedom
## Multiple R-squared: 0.567, Adjusted R-squared: 0.5237
## F-statistic: 13.09 on 10 and 100 DF, p-value: 2.325e-14
# VIFs remain low after the outlier removal.
vif(test_model_remove_fit)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.158175 1 1.076185
## Inf.Risk 1.977350 1 1.406183
## R.Cul.Rat 1.850144 1 1.360200
## R.CX.ray.Rat 1.388417 1 1.178311
## Avg.Nur 3.005297 1 1.733579
## Pct.Ser.Fac 2.836364 1 1.684151
## Med.Sc.Aff 1.615906 1 1.271183
## Region 1.352387 3 1.051599
# Root-mean-squared error between two equal-length numeric vectors.
# Symmetric in its arguments, so callers may pass them in either order.
rmse <- function(actual, predicted) {
  residuals <- actual - predicted
  sqrt(mean(residuals^2))
}
# In-sample RMSE of the reduced model before and after dropping the two
# influential observations. (rmse() is symmetric, so the swapped argument
# order in these calls does not change the result.)
predictions_before <- predict(test_model_fit, base_data)
rmse_lm_before <- rmse(predictions_before, base_data$Lgth.of.Sty)
base_data_filtered <- base_data[c(-112, -47), ]
predictions_after <- predict(test_model_remove_fit, base_data_filtered)
rmse_lm_after <- rmse(predictions_after, base_data_filtered$Lgth.of.Sty)
sprintf("RMSE without removing = %.3f -- RMSE without 112 and 47 = %.3f", rmse_lm_before, rmse_lm_after)
## [1] "RMSE without removing = 1.295 -- RMSE without 112 and 47 = 0.966"
The reduction in RMSE from 1.295 to 0.966 after removing observations 47 and 112 suggests that these points were contributing to higher model error. Their removal improves the model’s predictive accuracy, indicating that it generalizes better to new data and produces more reliable estimates
Additionally by removing highly correlated variables such as Number of Beds and Average Patients, we’ve substantially lowered variance inflation. This indicates that our changes have mitigated multicollinearity and improved the stability and interpretability of the model.
# 95% confidence intervals for the final reduced model's coefficients.
# Intervals that exclude 0 (Age, Inf.Risk, RegionS, RegionW) mark the
# predictors with clearly non-zero effects.
confint(test_model_remove_fit)
## 2.5 % 97.5 %
## (Intercept) 2.319250967 8.137900348
## Age 0.007470692 0.100657665
## Inf.Risk 0.265523483 0.672973359
## R.Cul.Rat -0.029830821 0.021138075
## R.CX.ray.Rat -0.003709698 0.019836041
## Avg.Nur -0.001287584 0.003524206
## Pct.Ser.Fac -0.024256032 0.018408909
## Med.Sc.AffNo -1.383346930 0.003986702
## RegionNC -1.106914726 0.017657181
## RegionS -1.322807364 -0.193872415
## RegionW -2.756046275 -1.382247990
Our model Lgth.of.Sty ~ Age + Inf.Risk + R.Cul.Rat + R.CX.ray.Rat + Avg.Nur + Pct.Ser.Fac + Med.Sc.Aff + Region aims to predict Length of Stay (Lgth.of.Sty) based on key hospital and patient characteristics.
# Full-predictor model refit on the outlier-free data (used for the ANOVA
# and as the design matrix source for LASSO below).
interaction_data <- base_data[c(-112, -47), ]
anova_model <- lm(Lgth.of.Sty ~ ., data = interaction_data)
summary(anova_model)
##
## Call:
## lm(formula = Lgth.of.Sty ~ ., data = interaction_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.07731 -0.57223 -0.08947 0.61897 3.06561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.9450368 1.4707217 3.362 0.001103 **
## Age 0.0578446 0.0233165 2.481 0.014811 *
## Inf.Risk 0.4226656 0.1053017 4.014 0.000117 ***
## R.Cul.Rat 0.0033790 0.0133374 0.253 0.800530
## R.CX.ray.Rat 0.0086477 0.0058544 1.477 0.142845
## N.Beds -0.0008609 0.0031127 -0.277 0.782699
## Med.Sc.AffNo -0.4888113 0.3610531 -1.354 0.178899
## RegionNC -0.5864174 0.2883010 -2.034 0.044649 *
## RegionS -0.8703365 0.2906440 -2.995 0.003480 **
## RegionW -1.8920505 0.3657131 -5.174 1.22e-06 ***
## Avg.Pat 0.0057002 0.0043733 1.303 0.195489
## Avg.Nur -0.0023495 0.0019917 -1.180 0.241002
## Pct.Ser.Fac -0.0095188 0.0113323 -0.840 0.402968
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.003 on 98 degrees of freedom
## Multiple R-squared: 0.5878, Adjusted R-squared: 0.5373
## F-statistic: 11.64 on 12 and 98 DF, p-value: 3.301e-14
# Sequential (Type I) ANOVA table; the residual mean square is used for
# the RMSE report below.
anova_results <- anova(anova_model)
vif(anova_model)
## GVIF Df GVIF^(1/(2*Df))
## Age 1.175247 1 1.084088
## Inf.Risk 2.140653 1 1.463097
## R.Cul.Rat 2.053431 1 1.432980
## R.CX.ray.Rat 1.391222 1 1.179501
## N.Beds 36.769932 1 6.063822
## Med.Sc.Aff 1.773930 1 1.331890
## Region 1.771611 3 1.100005
## Avg.Pat 43.286785 1 6.579269
## Avg.Nur 8.346041 1 2.888952
## Pct.Ser.Fac 3.243355 1 1.800932
High VIF values for N.Beds (36.77), Avg.Pat (43.29), and Avg.Nur (8.35) suggest that multicollinearity remains a concern for these variables.
# In-sample RMSE: square root of the ANOVA residual mean square, which
# equals the model's residual standard error (1.003 above).
mse <- anova_results["Residuals", "Mean Sq"]
rmse_anova <- sqrt(mse)
sprintf("Anova RMSE %.3f", rmse_anova)
## [1] "Anova RMSE 1.003"
Interpretations:
The ANOVA model indicates that significant predictors of Length of Stay (Lgth.of.Sty) include Age (p = 0.0148), where a 1 year increase is associated with a 0.0578-day increase, and Infection Risk (p = 0.0001), significantly increasing the length of stay. Regional differences also play a role, with patients in the NC region staying 0.586 days less (p = 0.0446), those in the South staying 0.870 days less (p = 0.0034), and those in the West experiencing the greatest reduction of 1.892 days (p < 0.001). On the other hand, several predictors, including R.Cul.Rat, R.CX.ray.Rat, N.Beds, Med.Sc.AffNo, Avg.Pat, Avg.Nur, and Pct.Ser.Fac, do not significantly impact the length of stay (p > 0.05), suggesting that they may not be strong predictors and may need to be removed from the model.
# Prepare data: Separate predictors (X) and target variable (y)
# "- 1" drops the intercept column so every factor level gets its own
# dummy column in the design matrix.
X <- model.matrix(Lgth.of.Sty ~ . - 1, data = interaction_data)
y <- interaction_data$Lgth.of.Sty
# Perform LASSO regression with cross-validation
# alpha = 1 selects the pure LASSO penalty in glmnet.
cv_lasso <- cv.glmnet(X, y, alpha = 1, nfolds = 10)
# Plot cross-validation results
plot(cv_lasso)
# Find the optimal lambda (penalty parameter)
# lambda.min minimises CV error (lambda.1se would give a sparser model).
best_lambda <- cv_lasso$lambda.min
sprintf("Optimal λ: %.5f", best_lambda)
## [1] "Optimal λ: 0.00702"
# Fit final LASSO model using optimal lambda
lasso_model <- glmnet(X, y, alpha = 1, lambda = best_lambda)
# Extract feature coefficients
lasso_coefficients <- coef(lasso_model)
# Convert coefficients to a named vector
lasso_coefficients <- as.vector(lasso_coefficients)
names(lasso_coefficients) <- rownames(coef(lasso_model))
# Print Intercept
intercept <- lasso_coefficients["(Intercept)"]
sprintf("Intercept: %.3f", intercept)
## [1] "Intercept: 4.518"
# Print selected features and their coefficients
# LASSO zeroes out unselected coefficients, so non-zero == selected.
selected_features <- lasso_coefficients[lasso_coefficients != 0]
lasso_table <- tibble(
Feature = names(selected_features),
Coefficient = round(selected_features, 3) # Round to 3 decimals for readability
)
lasso_table
# Compute RMSE using cross-validation
# NOTE(review): this is in-sample RMSE on the training matrix X, not the
# cross-validated error from cv_lasso.
predictions <- predict(lasso_model, X)
rmse_lasso <- sqrt(mean((y - predictions)^2))
# Compute R-squared
ss_total <- sum((y - mean(y))^2)
ss_residual <- sum((y - predictions)^2)
r_squared <- 1 - (ss_residual / ss_total)
# Compute Adjusted R-squared
n <- length(y)
# selected_features includes the intercept, hence the "- 1".
p <- length(selected_features) - 1
adj_r_squared <- 1 - ((1 - r_squared) * (n - 1) / (n - p - 1))
sprintf("RMSE %.3f | Adjusted R-squared: %.3f with optimal λ: %.5f", rmse_lasso, adj_r_squared, best_lambda)
## [1] "RMSE 0.944 | Adjusted R-squared: 0.540 with optimal λ: 0.00702"
After analyzing the outliers, we will proceed without both of them (observations 47 and 112).
# k-NN baseline on the outlier-free data: 5-fold CV over a grid of k values;
# caret selects the k minimising cross-validated RMSE.
# NOTE(review): no preProcess = c("center", "scale") is applied here, and
# k-NN distances are scale-sensitive -- worth confirming this was intentional.
cleaned_obj_2 <- base_data[c(-112, -47), ]
set.seed(1234)
train_control <- trainControl(method = "cv", number = 5)
knn_all_model <- train(
Lgth.of.Sty ~ .,
data = cleaned_obj_2,
method = "knn",
trControl = train_control,
tuneGrid = expand.grid(k = c(1:10, 20, 30))
)
print(knn_all_model)
## k-Nearest Neighbors
##
## 111 samples
## 10 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 88, 90, 88, 89, 89
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 1.636425 0.1570248 1.341632
## 2 1.422562 0.1693380 1.120702
## 3 1.433125 0.1277156 1.120258
## 4 1.361358 0.1735495 1.052855
## 5 1.340433 0.1906822 1.066184
## 6 1.338491 0.1898658 1.066725
## 7 1.308330 0.2283080 1.063999
## 8 1.309400 0.2201435 1.045493
## 9 1.306700 0.2295606 1.036747
## 10 1.325767 0.2110012 1.047278
## 20 1.314614 0.2099153 1.041218
## 30 1.291736 0.2333301 1.027280
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 30.
# Plot CV RMSE against k and report the winning tuning result.
plot(knn_all_model)
knn_results <- knn_all_model$results
best_k <- knn_all_model$bestTune$k
rmse_knn_all <- knn_results$RMSE[knn_results$k == best_k]
# Label fix: the metric caret minimised (and reported here) is RMSE, not MSE.
sprintf("Best k: %d, Best RMSE: %.3f", best_k, rmse_knn_all)
## [1] "Best k: 30, Best RMSE: 1.292"
When looking at all predictors, the best k for RMSE is 30; however, we see that at k = 9 we also have quite a low RMSE.
# k-NN on a reduced predictor set (presumably the LASSO-selected features --
# TODO confirm against lasso_table), same 5-fold CV and k grid.
set.seed(1234)
train_control <- trainControl(method = "cv", number = 5)
knn_v1_model <- train(
Lgth.of.Sty ~ Age + Inf.Risk + R.Cul.Rat + R.CX.ray.Rat + N.Beds + Med.Sc.Aff,
data = cleaned_obj_2,
method = "knn",
trControl = train_control,
tuneGrid = expand.grid(k = c(1:10, 20, 30))
)
print(knn_v1_model)
## k-Nearest Neighbors
##
## 111 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 88, 90, 88, 89, 89
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 1.607190 0.1527092 1.273367
## 2 1.415132 0.1777442 1.142162
## 3 1.355112 0.2226904 1.101688
## 4 1.343746 0.2298662 1.096834
## 5 1.314924 0.2434483 1.081474
## 6 1.303935 0.2417774 1.078696
## 7 1.284000 0.2547728 1.061646
## 8 1.274059 0.2704031 1.045664
## 9 1.279255 0.2676272 1.035398
## 10 1.267711 0.2791885 1.018944
## 20 1.307496 0.2181723 1.050198
## 30 1.288398 0.2367415 1.022785
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 10.
# Plot CV RMSE against k and report the winning tuning result.
plot(knn_v1_model)
knn_results <- knn_v1_model$results
best_k <- knn_v1_model$bestTune$k
rmse_knn_v1 <- knn_results$RMSE[knn_results$k == best_k]
# Label fix: the reported metric is RMSE, not MSE.
sprintf("Best k: %d, Best RMSE: %.3f", best_k, rmse_knn_v1)
## [1] "Best k: 10, Best RMSE: 1.268"
For the predictors our LASSO model chose, the lowest RMSE occurs at k = 10 (RMSE = 1.268), with k = 7 through 9 performing nearly as well.
# k-NN on the smallest predictor set (significant terms from the linear
# models), same 5-fold CV and k grid.
set.seed(1234)
train_control <- trainControl(method = "cv", number = 5)
knn_v2_model <- train(
Lgth.of.Sty ~ Age + Inf.Risk + Region + Med.Sc.Aff,
data = cleaned_obj_2,
method = "knn",
trControl = train_control,
tuneGrid = expand.grid(k = c(1:10, 20, 30))
)
print(knn_v2_model)
## k-Nearest Neighbors
##
## 111 samples
## 4 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 88, 90, 88, 89, 89
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 1.794866 0.1190193 1.396178
## 2 1.435927 0.2136443 1.130902
## 3 1.366410 0.2279952 1.085376
## 4 1.360840 0.2363762 1.091787
## 5 1.325933 0.2258781 1.056191
## 6 1.266148 0.2922350 1.015717
## 7 1.296658 0.2620468 1.048769
## 8 1.336213 0.1995070 1.087715
## 9 1.327809 0.2070321 1.080879
## 10 1.344680 0.1797312 1.084815
## 20 1.338769 0.2516622 1.077299
## 30 1.398054 0.1439504 1.126062
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 6.
# Plot CV RMSE against k and report the winning tuning result.
plot(knn_v2_model)
knn_results <- knn_v2_model$results
best_k <- knn_v2_model$bestTune$k
rmse_knn_v2 <- knn_results$RMSE[knn_results$k == best_k]
# Label fix: the reported metric is RMSE, not MSE.
sprintf("Best k: %d, Best RMSE: %.3f", best_k, rmse_knn_v2)
## [1] "Best k: 6, Best RMSE: 1.266"
Interestingly, for the predictors Age, Inf.Risk, Med.Sc.Aff, and Region, k = 6 has the best RMSE.
# Collect each model's predictor list for the comparison table.
# lm fits expose terms via terms(); caret formula-interface fits store the
# terms object directly on the train object.
lm_all_predictors <- attr(terms(test_model_fit), "term.labels")
lm_no_outliers_predictors <- attr(terms(test_model_remove_fit), "term.labels")
anova_predictors <- rownames(anova_results)
lasso_predictors <- lasso_table$Feature
knn_all_predictors <- attr(knn_all_model$terms, "term.labels")
knn_v1_predictors <- attr(knn_v1_model$terms, "term.labels")
knn_v2_predictors <- attr(knn_v2_model$terms, "term.labels")
# One row per model: name, RMSE (rounded), and its predictor list, sorted
# best-first. Note the RMSEs mix in-sample (lm/ANOVA/LASSO) and
# cross-validated (k-NN) estimates, so they are not strictly comparable.
model_comparison <- tibble(
Model = c(
"Linear Model (All Predictors)",
"Linear Model (No Outliers)",
"ANOVA",
"LASSO",
"k-NN (All Predictors)",
"k-NN (Version 1)",
"k-NN (Version 2)"
),
RMSE = c(
rmse_lm_before,
rmse_lm_after,
rmse_anova,
rmse_lasso,
rmse_knn_all,
rmse_knn_v1,
rmse_knn_v2
) |> round(3),
Features = c(
str_c(lm_all_predictors, collapse = " + "),
str_c(lm_no_outliers_predictors, collapse = " + "),
# Drop the "Residuals" row that anova() includes alongside the terms.
str_c(anova_predictors[anova_predictors != "Residuals"], collapse = " + "),
str_c(lasso_predictors, collapse = " + "),
str_c(knn_all_predictors, collapse = " + "),
str_c(knn_v1_predictors, collapse = " + "),
str_c(knn_v2_predictors, collapse = " + ")
)
) |> arrange(RMSE)
# Static DT table; the JS renderer wraps long feature lists so the
# Features column stays readable.
datatable(model_comparison, options = list(
dom = "t",
autoWidth = TRUE,
columnDefs = list(
list(width = "120px", targets = 1, className = "dt-center"),
list(targets = 2, render = JS(
"function(data, type, row) {
return type === 'display' && data.length > 50 ?
'<span style=\"white-space:normal;\">' + data + '</span>' :
data;
}"
))
)
), rownames = FALSE)